#!/usr/bin/perl -w
use strict;
use warnings;
use DBI;
use LWP::Protocol::https;
use LWP::UserAgent;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use Encode;
use utf8;
binmode(STDIN,':encoding(utf8)');
binmode(STDOUT,':encoding(utf8)');
binmode(STDERR,':encoding(utf8)'); 
use Time::Local;
use Date::Parse;

# Entry point: start crawling at the first listing page (offset 0).
# Plain call syntax is used instead of the legacy `&name(...)` form.
insert_news(0);

# Crawl the CNVD vulnerability listing (www.cnvd.org.cn) and store every entry
# published within the last seven days in the `news` table.
#
# Parameters:
#   $offset - listing offset to start from (the listing is requested 20
#             entries at a time); treated as 0 when omitted.
#
# Behaviour:
#   * Listing pages are walked in steps of 20 until an entry older than the
#     cut-off is met, a listing page cannot be fetched, or a page yields no
#     entry with a parseable publish date.
#     NOTE(review): stopping at the first old entry assumes the listing is
#     ordered newest-first - confirm against the site.
#   * For each entry the detail page is fetched to extract author, publish
#     time and description; rows are written with INSERT IGNORE.
#   * Entries whose detail page cannot be fetched or parsed are skipped with
#     a warning on STDERR.
#
# Dies when the database connection or any SQL statement fails.
# Returns nothing.
sub insert_news {
    my ($offset) = @_;
    $offset = 0 unless defined $offset;

    # FIXME(review): database credentials are hard-coded in the source;
    # consider moving them to configuration or the environment.
    my $driver   = "DBI:mysql";
    my $database = "news";
    my $host     = "localhost";
    my $user     = "root";
    my $password = "192510";
    my $db = DBI->connect("$driver:database=$database;host=$host;user=$user;password=$password")
        or die "Cannot connect to database '$database' on $host: $DBI::errstr\n";

    # Cut-off: start of the day seven days ago, as a Unix timestamp computed
    # by the database server.
    my ($cutoff) = $db->selectrow_array(
        "select UNIX_TIMESTAMP(date_sub(curdate(),interval 7 day))");
    die "Cannot determine the cut-off timestamp: "
        . ($db->errstr || 'no row returned') . "\n"
        unless defined $cutoff;

    # Prepared once and reused for every row.
    my $insert = $db->prepare(q{insert ignore into news(title,web,author,publishtime,content,link,nmd5)VALUES(?,?,?,?,?,?,?)})
        or die $db->errstr;

    # Value written to the `web` column for every row from this crawler.
    my $web = 2;

    # One user agent object serves the listing and the detail requests.
    my $ua = LWP::UserAgent->new;

    PAGE:
    while (1) {
        # agent() takes a single value; a fresh browser string is picked per page.
        $ua->agent(randomagent());

        my $url      = "http://www.cnvd.org.cn/flaw/listResult?max=20&offset=$offset";
        my $response = $ua->get($url);
        if (!$response->is_success) {
            warn "insert_news: cannot fetch $url: " . $response->status_line . "\n";
            last PAGE;
        }
        my $html = decode("utf-8", $response->content);

        # Every listing entry follows a warning-icon image; the text in front
        # of the first icon is page header and is not an entry.
        my @entries = split(/<img src="\/images\/wrang_con.gif"><\/img>.*[\r\n\t ]*<a href="/, $html);
        shift @entries;

        my $dated = 0;    # entries on this page whose publish date could be parsed

        ENTRY:
        for my $entry (@entries) {

            # Link: everything in front of title=" with the quotes removed.
            my @link_fields = split(/title="/, $entry);
            my $href = defined $link_fields[0] ? $link_fields[0] : '';
            $href =~ s/"//g;
            my $rel_href = "http://www.cnvd.org.cn$href";

            # Title: the value of the title attribute.
            my $rel_title = '';
            if (defined $link_fields[1]) {
                my @title_fields = split(/">/, $link_fields[1]);
                $rel_title = $title_fields[0] if defined $title_fields[0];
            }

            # A listing fragment without link or title cannot be stored, so
            # the detail request is not worth making.
            next ENTRY unless $href =~ /\S/ && $rel_title =~ /\S/;

            my $detail = $ua->get($rel_href);
            if (!$detail->is_success) {
                warn "insert_news: cannot fetch $rel_href: " . $detail->status_line . "\n";
                next ENTRY;
            }
            my $html2 = decode("utf-8", $detail->content);

            # Author (text following the "reported by" label).
            my $rel_author = _section_between($html2, qr/报送者:/,
                qr/<div class="blkContainerSblkCon clearfix">/);
            $rel_author = '' unless defined $rel_author;
            for ($rel_author) {
                s/<\/span>//g;
                s/<\/div>//g;
                s/[\r\n\t]*//g;
            }

            # Publish time (text following the "publish time" label).
            my $rel_publishtime = _section_between($html2, qr/发布时间/,
                qr/<td class="alignRight">/);
            $rel_publishtime = '' unless defined $rel_publishtime;
            for ($rel_publishtime) {
                s/<\/td>//g;
                s/<td>//g;
                s/<tr>//g;
                s/<\/tr>//g;
                s/[\r\n\t]*//g;
            }

            # Description (text following the "vulnerability description"
            # label); stays undef, and is stored as NULL, when absent.
            my $rel_content = _section_between($html2, qr/漏洞描述/,
                qr/<td class="alignRight">/);
            if (defined $rel_content) {
                for ($rel_content) {
                    s/<\/td>//g;
                    s/<td>//g;
                    s/<tr>//g;
                    s/<\/tr>//g;
                    s/[\r\n\t]*//g;
                    s/<br\/>//g;
                }
            }

            # Only entries from the last seven days are stored: convert the
            # scraped date to a Unix timestamp for the comparison.
            # NOTE(review): the date is parsed as GMT although the variable
            # name suggests Beijing time - confirm the intended time zone.
            my $bj_publishtime = str2time("$rel_publishtime GMT");
            if (!defined $bj_publishtime) {
                warn "insert_news: no parseable publish time on $rel_href\n";
                next ENTRY;
            }
            $dated++;

            # First entry older than the cut-off ends the whole crawl.
            last PAGE if $bj_publishtime < $cutoff;

            # Same test as before: author must contain a non-whitespace character.
            next ENTRY unless $rel_author =~ /\S/;

            # Raw 16-byte MD5 over the UTF-8 bytes of title + author.
            # encode() yields the same bytes the former internal
            # Encode::_utf8_off() call exposed for these decoded strings.
            my $md5 = md5(encode('UTF-8', $rel_title . $rel_author));

            $insert->execute($rel_title, $web, $rel_author, $rel_publishtime,
                $rel_content, $rel_href, $md5)
                or die $insert->errstr;
        }

        # A page without a single dated entry means the listing is exhausted
        # or its markup changed; continuing would only hammer the site.
        last PAGE unless $dated;

        sleep(1);         # pause between listing pages
        $offset += 20;
    }

    $db->disconnect;
    return;
}

# Return the part of $html that follows the first match of $start_re, cut at
# the first match of $end_re inside that part. Returns undef when $html is
# undefined, $start_re does not match, or nothing is left to return.
sub _section_between {
    my ($html, $start_re, $end_re) = @_;
    return unless defined $html;

    my @after_start = split($start_re, $html);
    return unless defined $after_start[1];

    my @before_end = split($end_re, $after_start[1]);
    return $before_end[0];
}

# Imitate a randomly chosen browser: return one User-Agent string picked at
# random from a fixed pool of Windows desktop browser strings (Firefox,
# Chrome, Safari, 360EE, Sogou, Baidu, Maxthon, LBBROWSER, TheWorld).
#
# Takes no arguments. Returns the chosen string.
#
# Two further candidates were noted alongside the list but are not part of
# the pool:
#   Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1
#   Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.7 (KHTML, like Gecko) Chrome/20.0.1099.0 Safari/536.7 QQBrowser/6.14.15138.201
sub randomagent
{
    my @agent_pool = (
        # Firefox
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:12.0) Gecko/20100101 Firefox/12.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:13.0) Gecko/20100101 Firefox/13.0.1',
        # Chrome (including 360EE)
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19 QIHU 360EE',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1',
        # Safari
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13',
        # Sogou
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        # Baidu
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; zh_CN) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/18.0 BIDUBrowser/2.6 Safari/534.7',
        # Maxthon
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.12 (KHTML, like Gecko) Maxthon/3.4.2.3000 Chrome/18.0.966.0 Safari/535.12',
        # LBBROWSER
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER',
        # TheWorld
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11 QIHU THEWORLD',
    );

    # The index range follows the size of the pool, so entries can be added
    # or removed without touching a separate count; the fractional random
    # value is truncated to an integer when used as an array index.
    return $agent_pool[ rand @agent_pool ];
}